require(quanteda)
require(stm)
require(estimatr)

#### covert the data to stm format ####
load("dfm_matrix.Rdata")

stm.data <- convert(dfm.matrix, to = "stm")
stm.data$meta$year <- factor(stm.data$meta$year)
stm.data$meta$party <- factor(stm.data$meta$party,
                              levels = c("others", "LDP", "JSP", "Komeito", "DSP", "JCP", "NFP", "DPJ", "left", "right"))
stm.data$meta$party.year <- paste(stm.data$meta$party, 
                                  stm.data$meta$year, sep = ".")

#### no-covariate model (correlated topic model) ####
set.seed(12345)
STM.result.no.covariate <- stm(documents = stm.data$documents, 
                               vocab = stm.data$vocab, K = 75)
save(STM.result.no.covariate, file = "STM_result_no_covariate.Rdata")

## confirm that the extracted topics are substantially the same as those found in the main analysis
# topics found in the main analysis
load("STM_result_model1_75.Rdata")
labelTopics(STM.result.model1, n = 5)$prob

# topics found by the no-covariate model
labelTopics(STM.result.no.covariate, n = 5)$prob
# confirm that the contents and the order of topics are the same as the original results
# so "topic.labels" and "distinctive.topic.id" in the main analysis can be used
topic.labels <- c("Political activity", "Ethical claims", "DPJ manifesto (2009)", 
                  "Criticism of the government (1993)", "Appeals of their sincerity", 
                  "Criticism of the government (2005)", "Postal privatization", "Energy policy", 
                  "Privatization of the JH", "Abolition of the consumption tax", "JCP manifesto (1996)", 
                  "Criticism of the government (2003)", "Appeals for youthfulness", 
                  "Criticism of the government (2009)", "Taxation in general", 
                  "Political reform", "Nursing", "Regime change", "Environmental issues", 
                  "NFP manifesto", "Criticism of the government (1990)", "JCP manifesto (2000)", 
                  "Criticism of the government (1986)", "Railways", "World affairs", 
                  "Decentralization", "Diplomacy in Asia", "Political ethics", "DPJ manifesto (2005)", 
                  "Reform within the LDP", "Elderly care", "Candidate information", "Future of Japan", 
                  "Community development", "Local development", "U.S. military bases", 
                  "Legislative records", "Abstract principles", "Health inequality", 
                  "JCP manifesto (2005)", "Appeals for change", "Policy support for mothers", 
                  "National problems in general", "Increasing domestic demand", 
                  "Regulation of political funds", "Agricultural administration", "Gender equality", 
                  "Urban development", "Science and technology", "Small and medium-sized enterprises", 
                  "Administration reform", "Economic measures", "Infrastructure", "Local tourism", 
                  "Appeals for novelty", "Coal mining", "Administrative efficiency", "Child care", 
                  "Appeals of their efforts", "Criticism of the government (2000)", 
                  "Freedom of information", "Agriculture and fisheries", "Fiscal reconstruction", 
                  "Criticism of the government (1996)", "Appeals for problem-solving ability", 
                  "Traffic network development", "Disaster prevention", "Title", "Pacifism", 
                  "Public pension", "Appeals for hearing voters' demands", 
                  "Policies of the Koizumi government", "Chugoku and Shikoku", 
                  "Pensions in general", "Aged society")
distinctive.topic.id <- c(42, 47, 58, 69, 46)

# five distinctive topics (Table A.1)
labelTopics(STM.result.no.covariate, n = 10)$prob[distinctive.topic.id, ]

#### post-estimation simulation ####
## conduct the post-estimation regression
post.reg.no.covariate <- list()
for (i in 1:75) {
  regression.data <- data.frame(stm.data$meta, 
                                theta = STM.result.no.covariate$theta[, i])
  post.reg.no.covariate[[i]] <- 
    summary(lm_robust(theta ~ female + year + party + age + I(age ^ 2) + incumbent + wins, 
                      regression.data, clusters = name))$coefficients
}

gender.diff.no.covariate <- matrix(NA, 75, 4)
for (i in 1:75) {
  gender.diff.no.covariate[i, ] <- post.reg.no.covariate[[i]][2, 1:4]  # the second row corresponds to the coefficient of the female dummy
}
rownames(gender.diff.no.covariate) <- topic.labels

# Table A.2
round(gender.diff.no.covariate[distinctive.topic.id, ], 3)

#### compute the first-differences (FDs) by year ####
## conduct the post-estimation regression
year.FD.reg.no.covariate <- list()
for (i in 1:5) {
  regression.data <- data.frame(stm.data$meta, 
                                theta = STM.result.no.covariate$theta[, distinctive.topic.id[i]])
  year.FD.reg.no.covariate[[i]] <- 
    lm_robust(theta ~ female * year + party.year + age + I(age ^ 2) + incumbent + wins, 
              regression.data, clusters = name)
}

## compute the estimates of year-by-year FDs and their standard errors
year.seq <- unique(docvars(dfm.matrix, "year"))
year.FD.est.no.covariate <- array(NA, c(8, 2, 5))
for (i in 1:5) {
  year.FD.est.no.covariate[1, 1, i] <- year.FD.reg.no.covariate[[i]]$coefficients[2]
  year.FD.est.no.covariate[1, 2, i] <- year.FD.reg.no.covariate[[i]]$std.error[2]
  for (j in 1:7) {
    year.FD.est.no.covariate[j + 1, 1, i] <- 
      year.FD.reg.no.covariate[[i]]$coefficients[2] + 
      year.FD.reg.no.covariate[[i]]$coefficients[74 + j]
    year.FD.est.no.covariate[j + 1, 2, i] <- 
      sqrt(year.FD.reg.no.covariate[[i]]$vcov[2, 2] + 
             year.FD.reg.no.covariate[[i]]$vcov[67 + j, 67 + j] + 
             2 * year.FD.reg.no.covariate[[i]]$vcov[2, 67 + j])
  }
}

# Figure A.7
cairo_pdf("Figure_A7.pdf", 
          width = 6, height = 4.2, pointsize = 8, family = "Helvetica")
layout(matrix(1:6, 2, 3, byrow = TRUE))
par(mar = c(4, 4, 2.5, 1.5), lwd = 0.5)
for (i in 1:5) {
  plot(NULL, NULL, type = "n", bty = "L", 
       xlim = c(1985, 2010), ylim = c(-0.02, 0.04), 
       main = topic.labels[distinctive.topic.id[i]], 
       xlab = "", ylab = "", xaxt = "n", yaxt = "n")
  segments(1994, -0.02, 1994, 0.03, lty = 3)
  text(1994, 0.03, "Electoral\nreform", pos = 3)
  abline(h = 0, lty = 3, col = "gray50")
  segments(year.seq, 
           year.FD.est.no.covariate[, 1, i] + 
             year.FD.est.no.covariate[, 2, i] * qnorm(0.025), 
           year.seq, 
           year.FD.est.no.covariate[, 1, i] + 
             year.FD.est.no.covariate[, 2, i] * qnorm(0.975))
  points(year.seq, year.FD.est.no.covariate[, 1, i], pch = 19)
  axis(1, at = year.seq, labels = FALSE, lwd = 0.5)
  mtext(c("86", "90", "93", "96", "00", "03", "05", "09"), 
        side = 1, line = 1, at = year.seq, cex = 0.7)
  mtext("Election year", side = 1, line = 2.5, cex = 0.8)
  axis(2, lwd = 0.5)
  mtext("Percentage points", side = 2, line = 2.5, cex = 0.8)
}
dev.off()